{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np  #矩阵操作\n",
    "import pandas as pd #sql数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>LSTAT</th>\n",
       "      <th>MEDV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00632</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.31</td>\n",
       "      <td>0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>6.575</td>\n",
       "      <td>65.2</td>\n",
       "      <td>4.0900</td>\n",
       "      <td>1</td>\n",
       "      <td>296</td>\n",
       "      <td>15.3</td>\n",
       "      <td>4.98</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02731</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>6.421</td>\n",
       "      <td>78.9</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2</td>\n",
       "      <td>242</td>\n",
       "      <td>17.8</td>\n",
       "      <td>9.14</td>\n",
       "      <td>21.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.02729</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>7.185</td>\n",
       "      <td>61.1</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2</td>\n",
       "      <td>242</td>\n",
       "      <td>17.8</td>\n",
       "      <td>4.03</td>\n",
       "      <td>34.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.03237</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>6.998</td>\n",
       "      <td>45.8</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>18.7</td>\n",
       "      <td>2.94</td>\n",
       "      <td>33.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.06905</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>7.147</td>\n",
       "      <td>54.2</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3</td>\n",
       "      <td>222</td>\n",
       "      <td>18.7</td>\n",
       "      <td>5.33</td>\n",
       "      <td>36.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  PTRATIO  \\\n",
       "0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   \n",
       "1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   \n",
       "2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   \n",
       "3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   \n",
       "4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   \n",
       "\n",
       "   LSTAT  MEDV  \n",
       "0   4.98  24.0  \n",
       "1   9.14  21.6  \n",
       "2   4.03  34.7  \n",
       "3   2.94  33.4  \n",
       "4   5.33  36.2  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\"boston.csv\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 506 entries, 0 to 505\n",
      "Data columns (total 13 columns):\n",
      "CRIM       506 non-null float64\n",
      "ZN         506 non-null float64\n",
      "INDUS      506 non-null float64\n",
      "CHAS       506 non-null int64\n",
      "NOX        506 non-null float64\n",
      "RM         506 non-null float64\n",
      "AGE        506 non-null float64\n",
      "DIS        506 non-null float64\n",
      "RAD        506 non-null int64\n",
      "TAX        506 non-null int64\n",
      "PTRATIO    506 non-null float64\n",
      "LSTAT      506 non-null float64\n",
      "MEDV       506 non-null float64\n",
      "dtypes: float64(10), int64(3)\n",
      "memory usage: 51.5 KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(490, 13)\n"
     ]
    }
   ],
   "source": [
    "#删除y大于等于50的样本\n",
    "df = df[df.MEDV < 50]\n",
    "\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#从原始数据中分离输入特征x和输出y\n",
    "y = df['MEDV']\n",
    "X = df.drop('MEDV', axis = 1)\n",
    "\n",
    "#尝试对y作log变换，对log变换后的价格进行评估\n",
    "log_y = np.log1p(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "X[\"RAD\"].astype(\"object\")\n",
    "x_cat = X[\"RAD\"]\n",
    "x_cat = pd.get_dummies(x_cat, prefix = \"RAD\")\n",
    "\n",
    "x = X.drop(\"RAD\", axis = 1)\n",
    "\n",
    "#特征名称，用于保存特征工程结果\n",
    "feat_name = X.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>RAD_1</th>\n",
       "      <th>RAD_2</th>\n",
       "      <th>RAD_3</th>\n",
       "      <th>RAD_4</th>\n",
       "      <th>RAD_5</th>\n",
       "      <th>RAD_6</th>\n",
       "      <th>RAD_7</th>\n",
       "      <th>RAD_8</th>\n",
       "      <th>RAD_24</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   RAD_1  RAD_2  RAD_3  RAD_4  RAD_5  RAD_6  RAD_7  RAD_8  RAD_24\n",
       "0      1      0      0      0      0      0      0      0       0\n",
       "1      0      1      0      0      0      0      0      0       0\n",
       "2      0      1      0      0      0      0      0      0       0\n",
       "3      0      0      1      0      0      0      0      0       0\n",
       "4      0      0      1      0      0      0      0      0       0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_cat.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#数据标准化\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "#分别初始化对特征值和目标值的标准化器\n",
    "ss_X = StandardScaler()\n",
    "ss_y = StandardScaler()\n",
    "\n",
    "ss_log_y = StandardScaler()\n",
    "\n",
    "#分别对训练和测试数据的特征以及目标值进行标准化处理\n",
    "#对训练数据，先调用fit方法训练模型，得到模型参数；然后对训练数据和测试数据进行transform\n",
    "X = ss_X.fit_transform(X)\n",
    "\n",
    "#对y做标准化不是必须\n",
    "#对y标准化的好处是不同问题的w差异不太大，同时正则参数的范围也有限\n",
    "#y = ss_y.fit_transform(y.reshape(-1, 1))\n",
    "#log_y = ss_y.fit_transform(log_y.reshape(-1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "fe_data = pd.DataFrame(data = X, columns = feat_name, index = df.index)\n",
    "fe_data = pd.concat([fe_data, x_cat], axis = 1, ignore_index = False)\n",
    "\n",
    "#加上标签\n",
    "fe_data[\"MEDV\"] = y\n",
    "fe_data[\"log_MEDV\"] = log_y\n",
    "\n",
    "fe_data.to_csv('FE_boston.csv', index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>...</th>\n",
       "      <th>RAD_2</th>\n",
       "      <th>RAD_3</th>\n",
       "      <th>RAD_4</th>\n",
       "      <th>RAD_5</th>\n",
       "      <th>RAD_6</th>\n",
       "      <th>RAD_7</th>\n",
       "      <th>RAD_8</th>\n",
       "      <th>RAD_24</th>\n",
       "      <th>MEDV</th>\n",
       "      <th>log_MEDV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-0.417401</td>\n",
       "      <td>0.301813</td>\n",
       "      <td>-1.291856</td>\n",
       "      <td>-0.250812</td>\n",
       "      <td>-0.139895</td>\n",
       "      <td>0.505040</td>\n",
       "      <td>-0.109432</td>\n",
       "      <td>0.121208</td>\n",
       "      <td>-0.980635</td>\n",
       "      <td>-0.667101</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>3.218876</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.414992</td>\n",
       "      <td>-0.486924</td>\n",
       "      <td>-0.593329</td>\n",
       "      <td>-0.250812</td>\n",
       "      <td>-0.731821</td>\n",
       "      <td>0.269017</td>\n",
       "      <td>0.377488</td>\n",
       "      <td>0.537330</td>\n",
       "      <td>-0.865459</td>\n",
       "      <td>-0.988734</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>21.6</td>\n",
       "      <td>3.117950</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>-0.414995</td>\n",
       "      <td>-0.486924</td>\n",
       "      <td>-0.593329</td>\n",
       "      <td>-0.250812</td>\n",
       "      <td>-0.731821</td>\n",
       "      <td>1.439934</td>\n",
       "      <td>-0.255152</td>\n",
       "      <td>0.537330</td>\n",
       "      <td>-0.865459</td>\n",
       "      <td>-0.988734</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>34.7</td>\n",
       "      <td>3.575151</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>-0.414412</td>\n",
       "      <td>-0.486924</td>\n",
       "      <td>-1.310933</td>\n",
       "      <td>-0.250812</td>\n",
       "      <td>-0.826186</td>\n",
       "      <td>1.153335</td>\n",
       "      <td>-0.798939</td>\n",
       "      <td>1.056878</td>\n",
       "      <td>-0.750284</td>\n",
       "      <td>-1.107857</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>33.4</td>\n",
       "      <td>3.538057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-0.410202</td>\n",
       "      <td>-0.486924</td>\n",
       "      <td>-1.310933</td>\n",
       "      <td>-0.250812</td>\n",
       "      <td>-0.826186</td>\n",
       "      <td>1.381694</td>\n",
       "      <td>-0.500390</td>\n",
       "      <td>1.056878</td>\n",
       "      <td>-0.750284</td>\n",
       "      <td>-1.107857</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>36.2</td>\n",
       "      <td>3.616309</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \\\n",
       "0 -0.417401  0.301813 -1.291856 -0.250812 -0.139895  0.505040 -0.109432   \n",
       "1 -0.414992 -0.486924 -0.593329 -0.250812 -0.731821  0.269017  0.377488   \n",
       "2 -0.414995 -0.486924 -0.593329 -0.250812 -0.731821  1.439934 -0.255152   \n",
       "3 -0.414412 -0.486924 -1.310933 -0.250812 -0.826186  1.153335 -0.798939   \n",
       "4 -0.410202 -0.486924 -1.310933 -0.250812 -0.826186  1.381694 -0.500390   \n",
       "\n",
       "        DIS       RAD       TAX  ...  RAD_2  RAD_3  RAD_4  RAD_5  RAD_6  \\\n",
       "0  0.121208 -0.980635 -0.667101  ...      0      0      0      0      0   \n",
       "1  0.537330 -0.865459 -0.988734  ...      1      0      0      0      0   \n",
       "2  0.537330 -0.865459 -0.988734  ...      1      0      0      0      0   \n",
       "3  1.056878 -0.750284 -1.107857  ...      0      1      0      0      0   \n",
       "4  1.056878 -0.750284 -1.107857  ...      0      1      0      0      0   \n",
       "\n",
       "   RAD_7  RAD_8  RAD_24  MEDV  log_MEDV  \n",
       "0      0      0       0  24.0  3.218876  \n",
       "1      0      0       0  21.6  3.117950  \n",
       "2      0      0       0  34.7  3.575151  \n",
       "3      0      0       0  33.4  3.538057  \n",
       "4      0      0       0  36.2  3.616309  \n",
       "\n",
       "[5 rows x 23 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fe_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 490 entries, 0 to 505\n",
      "Data columns (total 23 columns):\n",
      "CRIM        490 non-null float64\n",
      "ZN          490 non-null float64\n",
      "INDUS       490 non-null float64\n",
      "CHAS        490 non-null float64\n",
      "NOX         490 non-null float64\n",
      "RM          490 non-null float64\n",
      "AGE         490 non-null float64\n",
      "DIS         490 non-null float64\n",
      "RAD         490 non-null float64\n",
      "TAX         490 non-null float64\n",
      "PTRATIO     490 non-null float64\n",
      "LSTAT       490 non-null float64\n",
      "RAD_1       490 non-null uint8\n",
      "RAD_2       490 non-null uint8\n",
      "RAD_3       490 non-null uint8\n",
      "RAD_4       490 non-null uint8\n",
      "RAD_5       490 non-null uint8\n",
      "RAD_6       490 non-null uint8\n",
      "RAD_7       490 non-null uint8\n",
      "RAD_8       490 non-null uint8\n",
      "RAD_24      490 non-null uint8\n",
      "MEDV        490 non-null float64\n",
      "log_MEDV    490 non-null float64\n",
      "dtypes: float64(14), uint8(9)\n",
      "memory usage: 61.7 KB\n"
     ]
    }
   ],
   "source": [
    "fe_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
